SOCR ≫ DSPA ≫ DSPA2 Topics ≫

1 Packages

library(utility)
library(progress)
library(data.table)
library(Rtsne)
library(ggplot2)
library(plotly)
library(lubridate)
library(dummies)
library(umap)
library(dplyr)
library(missForest)

2 Data Processing

total_cols: 22940L total rows: 502493

initial_read <- fread("ukbb_data/ukb44534_compiled_tab-001.csv", nrows = 1)
total_cols <- ncol(initial_read)
total_rows <- as.numeric(system("wc -l < ukbb_data/ukb44534_compiled_tab-001.csv", intern=TRUE))

This code block sets up parallel processing capabilities, defines file paths, and initializes progress tracking. It includes a function to process each column of the dataset, determining the data type and saving summaries.

library(data.table)
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
cores <- detectCores() - 1
registerDoParallel(cores)

data_path <- "ukbb_data/1000_rows.csv"
progress_path <- "progress_log.csv"
total_cols <- 29L

plot_dir <- paste0("plots/")
if (!dir.exists(plot_dir)) {
  dir.create(plot_dir, recursive = TRUE)
  }

summary_dir <- paste0("summaries/")
if (!dir.exists(summary_dir)) {
  dir.create(summary_dir, recursive = TRUE)
  }
if (file.exists(progress_path)) {
  progress <- fread(progress_path)
  start_col <- max(progress$col_processed) + 1
} else {
  fwrite(data.table(col_processed = integer(0)), progress_path)
  start_col <- 2  
}

process_column <- function(col) {
  column_titles <- fread(data_path, nrows = 1)
  col_title <- names(column_titles)[col]
  current_col_sample <- fread(data_path, select = col, nrows = 30)
  detected_type <- "Other"
  sample_values <- na.omit(current_col_sample[[1]])
  if (length(sample_values) == 0) {
    detected_type <- "Empty"
  } else if (all(is.numeric(sample_values))) {
    if (all(sample_values %in% c(0, 1))) {
      detected_type <- "Binary"
    } else {
      detected_type <- "Numeric"
    }
  } else if (any(grepl("^\\d{4}-\\d{2}-\\d{2}$", sample_values))) {
    detected_type <- "Date"
  } else {
    detected_type <- "Categorical"
    current_col <- fread(data_path, select = col)
    freq <- table(current_col[[1]], useNA = "ifany")
    freq <- as.data.frame(freq)
    freq$Prop <- freq$Freq / sum(freq$Freq)
    fwrite(freq, paste0("summaries/", col_title, ".csv"))
  }

  fwrite(data.table(col_processed = col), progress_path, append = TRUE)
  return(list(colname = col_title, type = detected_type))
}

results <- foreach(col = start_col:total_cols, .combine = 'rbind', .packages = 'data.table') %dopar% {
  process_column(col)
}

results <- as.data.table(results)
fwrite(results, "column_types_summary.csv")

print(results)
##                                                                         colname
##  1:                                          peak_expiratory_flow_pef_f3064_2_1
##  2:                                               triplet_played_left_f4229_0_9
##  3:                                              triplet_entered_left_f4236_0_8
##  4:                                       target_number_to_be_entered_f4252_1_1
##  5:                                    number_entered_by_participant_f4258_0_12
##  6:                                visual_acuity_result_in_round_left_f5082_1_8
##  7:                                                   ecg_stage_name_f5988_1_27
##  8:                           total_peripheral_resistance_during_pwa_f12685_2_0
##  9:                                                   operation_code_f20004_2_4
## 10: method_of_recording_time_when_noncancer_illness_first_diagnosed_f20013_2_11
## 11:                       workplace_full_of_chemical_or_other_fumes_f22610_0_24
## 12:                         breathing_problems_during_period_of_job_f22616_0_19
## 13:                                                   values_wanted_f23321_2_78
## 14:                                                  values_wanted_f23321_2_104
## 15:                                                   values_wanted_f23321_3_16
## 16:          mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 17:        mean_icvf_in_anterior_corona_radiata_on_fa_skeleton_right_f25366_3_0
## 18:           mean_od_in_superior_corona_radiata_on_fa_skeleton_left_f25417_3_0
## 19:                   volume_of_molecularlayerhphead_left_hemisphere_f26629_3_0
## 20:                    mean_thickness_of_precentral_right_hemisphere_f27288_3_0
## 21:                   volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 22:                                               urea_reportability_f30676_1_0
## 23:                                      histology_of_cancer_tumour_f40011_16_0
## 24:                                                      external_ca_f41201_0_6
## 25:                       main_speciality_of_consultant_polymorphic_f41207_0_11
## 26:                    date_of_first_inpatient_diagnosis_main_icd10_f41262_0_33
## 27:                         date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 28:                                          typical_diet_yesterday_f100020_4_0
##                                                                         colname
##            type
##  1:     Numeric
##  2:     Numeric
##  3:     Numeric
##  4:       Empty
##  5:       Empty
##  6:       Empty
##  7: Categorical
##  8:     Numeric
##  9:       Empty
## 10:       Empty
## 11:       Empty
## 12:       Empty
## 13:       Empty
## 14:       Empty
## 15:     Numeric
## 16:     Numeric
## 17:       Empty
## 18:       Empty
## 19:       Empty
## 20:       Empty
## 21:     Numeric
## 22:      Binary
## 23:       Empty
## 24:       Empty
## 25:       Empty
## 26:       Empty
## 27:        Date
## 28:      Binary
##            type

This code block sets up for data visualization by reading a summary file and preparing a pie chart to visualize the distribution of data types.

file_path <- "column_types_summary.csv" 
data <- fread(file_path)


type_counts <- table(data$type)
type_counts_df <- as.data.frame(type_counts)
names(type_counts_df) <- c("Type", "Frequency")
type_counts_df$Prop <- type_counts_df$Frequency / sum(type_counts_df$Frequency)


if (nrow(type_counts_df) > 1) {
  pie_colors <- rainbow(nrow(type_counts_df))
  png(file = "type_distribution_pie_chart.png")
  pie(type_counts_df$Frequency, labels = paste0(type_counts_df$Type, ": ", round(type_counts_df$Prop * 100, 1), "%"), 
      col = pie_colors, main = "Pie Chart of Data Types")
  dev.off()
} else {
  cat("Not enough data to plot a pie chart.\n")
}
## quartz_off_screen 
##                 2

3 Data Processing

Read subset for toy analysis

data <- fread('ukbb_data/1000_rows.csv', na.strings = c("", "NA", "na"))
data <- convert_dates_to_numeric(data)
## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.

## Warning: All formats failed to parse. No formats found.
data <- apply_label_encoding(data)
data <- normalize_df(data)
data <- data %>% select(which(colSums(is.na(data)) != nrow(data)) %>% names())  
data <- missForest(data, verbose = TRUE)$ximp
##   missForest iteration 1 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.04060135 
##     time: 0.026 seconds
## 
##   missForest iteration 2 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.02085736 
##     time: 0.024 seconds
## 
##   missForest iteration 3 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.01624264 
##     time: 0.027 seconds
## 
##   missForest iteration 4 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.01359888 
##     time: 0.029 seconds
## 
##   missForest iteration 5 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.008524644 
##     time: 0.024 seconds
## 
##   missForest iteration 6 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values.  Are you sure you want to do
## regression?
## done!
##     estimated error(s): NaN 
##     difference(s): 0.01154912 
##     time: 0.023 seconds
data <- unique(data)                                  # remove duplication
data_matrix <- as.matrix(data)            # make sure the data is numeric
data <- data[, .SD, .SDcols = sapply(data, function(x) !all(x == 0))]

4 Analysis

4.1 K-means

This section performs a K-means clustering analysis. It identifies the optimal number of clusters using silhouette scores and show visualization of the of the K index parameter versus

library(cluster)    
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)  # for plotting

set.seed(123) 

optimal_k <- 2
highest_silhouette <- 0
sil_scores <- numeric(29)  # Vector to store average silhouette scores for each k

for(k in 2:30) {
  km.res <- kmeans(data_matrix, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(data_matrix))
  avg.sil <- mean(ss[, 3])
  sil_scores[k-1] <- avg.sil  # Store the silhouette score
  
  if(avg.sil > highest_silhouette){
    highest_silhouette <- avg.sil
    optimal_k <- k
  }
}

clusters <- kmeans(data_matrix, centers = optimal_k, nstart = 25)

cat("Optimal number of clusters:", optimal_k, "\n")
## Optimal number of clusters: 3
print(clusters)
## K-means clustering with 3 clusters of sizes 4, 34, 25
## 
## Cluster means:
##   number_of_operations_selfreported_f136_0_0 peak_expiratory_flow_pef_f3064_2_1
## 1                                  0.0937500                          0.5172072
## 2                                  0.2205882                          0.7152751
## 3                                  0.2200000                          0.5399941
##   triplet_played_left_f4229_0_9 triplet_entered_left_f4236_0_8
## 1                     0.7192982                      0.1721622
## 2                     0.4419424                      0.2673173
## 3                     0.4532129                      0.6596246
##   ecg_stage_name_f5988_1_27 total_peripheral_resistance_during_pwa_f12685_2_0
## 1                 0.2425000                                         0.9285518
## 2                 0.1523529                                         0.9095128
## 3                 0.5964000                                         0.5586293
##   values_wanted_f23321_3_16
## 1                         0
## 2                         0
## 3                         0
##   mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 1                                                          0.7160994
## 2                                                          0.7359227
## 3                                                          0.4031346
##   volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 1                                                 0.4573078
## 2                                                 0.5419368
## 3                                                 0.2915750
##   urea_reportability_f30676_1_0
## 1                             0
## 2                             0
## 3                             0
##   date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 1                                           0.6734667
## 2                                           0.7579405
## 3                                           0.2110482
##   typical_diet_yesterday_f100020_4_0
## 1                          0.0000000
## 2                          0.9211765
## 3                          0.8848000
## 
## Clustering vector:
##  [1] 3 3 2 2 2 3 2 3 2 2 2 2 1 2 3 2 2 2 1 2 2 3 1 2 2 2 2 2 2 3 3 2 2 3 3 3 3 2
## [39] 2 3 2 3 2 2 2 3 3 1 3 3 3 2 3 2 3 3 2 3 3 2 2 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 1.323683 6.254919 7.267960
##  (between_SS / total_SS =  54.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Plotting the silhouette score vs. number of clusters
k_values <- 2:30
sil_plot <- ggplot(data = data.frame(k_values, sil_scores), aes(x = k_values, y = sil_scores)) +
            geom_line() +
            scale_x_continuous(breaks = 2:30) +
            theme_minimal() +
            ggtitle("Silhouette Score vs. Number of Clusters") +
            xlab("Number of Clusters (k)") +
            ylab("Average Silhouette Score")

print(sil_plot)

4.2 Dimensional reduction with PCA,t-SNE and UMAP

Here, PCA, t-SNE, and UMAP techniques are applied for dimensionality reduction. The results are prepared for subsequent visualization.

# PCA
non_constant_data_matrix <- data_matrix[, apply(data_matrix, 2, var) != 0]
pca_result_2d <- prcomp(non_constant_data_matrix, scale. = TRUE, center = TRUE)
pca_2d <- pca_result_2d$x[, 1:2]  
pca_3d <- pca_result_2d$x[, 1:3]  

# t-sne
tsne_2d <- Rtsne(data_matrix, dims = 2, perplexity=20, verbose=TRUE)    
## Performing PCA
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.979592)!
## Learning embedding...
## Iteration 50: error is 47.812786 (50 iterations in 0.00 seconds)
## Iteration 100: error is 54.407321 (50 iterations in 0.00 seconds)
## Iteration 150: error is 47.916957 (50 iterations in 0.00 seconds)
## Iteration 200: error is 52.413016 (50 iterations in 0.00 seconds)
## Iteration 250: error is 51.839231 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.493987 (50 iterations in 0.00 seconds)
## Iteration 350: error is 0.925483 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.468466 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.326026 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.234702 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.216714 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.168604 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.152610 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.146334 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.136861 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.137959 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.140215 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.139350 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.138507 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.139166 (50 iterations in 0.00 seconds)
## Fitting performed in 0.05 seconds.
tsne_3d <- Rtsne(data_matrix, dims = 3, perplexity=20, verbose=TRUE)  
## Performing PCA
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 3, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.979592)!
## Learning embedding...
## Iteration 50: error is 46.745493 (50 iterations in 0.00 seconds)
## Iteration 100: error is 48.739883 (50 iterations in 0.00 seconds)
## Iteration 150: error is 47.681479 (50 iterations in 0.00 seconds)
## Iteration 200: error is 49.399400 (50 iterations in 0.00 seconds)
## Iteration 250: error is 47.451295 (50 iterations in 0.00 seconds)
## Iteration 300: error is 0.857982 (50 iterations in 0.00 seconds)
## Iteration 350: error is 0.287598 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.156503 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.125532 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.117372 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.113466 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.109056 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.107365 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.105148 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.104121 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.105026 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.104542 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.104130 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.103545 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.105916 (50 iterations in 0.00 seconds)
## Fitting performed in 0.07 seconds.
# umap
umap_result_2d <- umap(data_matrix, n_components = 2)                  
umap_result_3d <- umap(data_matrix, n_components = 3)                  

4.3 PCA 2D and 3D Visualization

pca_2d_df <- as.data.frame(pca_2d)               # PCA 2d
pca_2d_df$cluster <- as.factor(clusters$cluster)
ggplot(pca_2d_df, aes(x = PC1, y = PC2, color = cluster)) + geom_point() + theme_minimal() + ggtitle("PCA 2D Visualization with Cluster Coloring")

pca_3d_df <- as.data.frame(pca_3d)
pca_3d_df$cluster <- as.factor(clusters$cluster)
fig_pca_3d_cluster <- plot_ly(pca_3d_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster, colors = RColorBrewer::brewer.pal(3,"Dark2"), type = 'scatter3d', mode = 'markers')
fig_pca_3d_cluster

4.4 t-SNE 2D and 3D Visualization

data_2d <- as.data.frame(tsne_2d$Y)               # t-sne 2d 
data_2d$cluster <- as.factor(clusters$cluster)    # add cluster result
data_3d <- as.data.frame(tsne_3d$Y)
data_3d$cluster <- as.factor(clusters$cluster)
ggplot(data_2d, aes(x = V1, y = V2, color = cluster)) + geom_point() + theme_minimal() +   ggtitle("t-SNE 2D Visualization with Cluster Coloring")

fig <- plot_ly(data_3d, x = ~V1, y = ~V2, z = ~V3, color = ~cluster, colors = RColorBrewer::brewer.pal(3,"Dark2"), type = 'scatter3d', mode = 'markers')
fig

4.5 UMAP 2D and 3D Visualization

data_3d <- as.data.frame(tsne_3d$Y)
data_3d$cluster <- as.factor(clusters$cluster)
data_2d <- data.frame(X = umap_result_2d$layout[,1], Y = umap_result_2d$layout[,2])  # umap 2d
data_2d$cluster <- as.factor(clusters$cluster)
ggplot(data_2d, aes(x = X, y = Y, color = cluster)) + 
  geom_point() + 
  theme_minimal() + 
  ggtitle("UMAP 2D Visualization with Cluster Coloring")

data_3d <- data.frame(X = umap_result_3d$layout[,1], Y = umap_result_3d$layout[,2], Z = umap_result_3d$layout[,3])
data_3d <- data.frame(X = umap_result_3d$layout[,1], Y = umap_result_3d$layout[,2], Z = umap_result_3d$layout[,3])  
data_3d$cluster <- as.factor(clusters$cluster)

fig <- plot_ly(data_3d, x = ~X, y = ~Y, z = ~Z, color = ~cluster, colors = RColorBrewer::brewer.pal(length(unique(data_3d$cluster)),"Dark2"), type = 'scatter3d', mode = 'markers')
fig

5 Feature selection

5.1 t-tests

This code performs feature selection by conducting pairwise t-tests between clusters for each feature in the data matrix, adjusting the p-values using the Benjamini-Hochberg method to identify statistically significant features. It creates a summary table to display significant features and their adjusted p-values.

results_ttest <- list()  
feature_names <- colnames(data_matrix)  

unique_clusters <- unique(clusters$cluster)

for(i in 1:ncol(data_matrix)) {
  feature_results <- list()  
  
  for(j in 1:(length(unique_clusters)-1)) {
    for(k in (j+1):length(unique_clusters)) {
      
      group1 <- data_matrix[clusters$cluster == unique_clusters[j], i]
      group2 <- data_matrix[clusters$cluster == unique_clusters[k], i]
      
      if(var(group1) != 0 && var(group2) != 0) {
        feature_results[[paste(unique_clusters[j], unique_clusters[k], sep="_")]] <- t.test(group1, group2)$p.value
      } else {
        feature_results[[paste(unique_clusters[j], unique_clusters[k], sep="_")]] <- NA
      }
    }
  }
  
  results_ttest[[feature_names[i]]] <- feature_results
}

all_p_values <- sapply(results_ttest, function(feature) {
  min_p_value <- min(unlist(feature), na.rm = TRUE)
  return(ifelse(is.infinite(min_p_value), NA, min_p_value))
})
## Warning in min(unlist(feature), na.rm = TRUE): no non-missing arguments to min;
## returning Inf

## Warning in min(unlist(feature), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
p_adjusted <- p.adjust(all_p_values, method = "BH")
significant_features <- which(p_adjusted < 0.05)

summary_table <- data.frame(
  Feature = feature_names[significant_features],
  P_Value = p_adjusted[significant_features]
)

print(summary_table)
##                                                                                                                               Feature
## peak_expiratory_flow_pef_f3064_2_1                                                                 peak_expiratory_flow_pef_f3064_2_1
## triplet_entered_left_f4236_0_8                                                                         triplet_entered_left_f4236_0_8
## ecg_stage_name_f5988_1_27                                                                                   ecg_stage_name_f5988_1_27
## total_peripheral_resistance_during_pwa_f12685_2_0                                   total_peripheral_resistance_during_pwa_f12685_2_0
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0                   volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12                               date_of_first_inpatient_diagnosis_icd10_f41280_0_12
##                                                                         P_Value
## peak_expiratory_flow_pef_f3064_2_1                                 1.450792e-05
## triplet_entered_left_f4236_0_8                                     4.876289e-09
## ecg_stage_name_f5988_1_27                                          1.442499e-20
## total_peripheral_resistance_during_pwa_f12685_2_0                  1.367132e-08
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 2.823986e-11
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0          8.229486e-12
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12                4.692706e-26

5.2 Boruta based random forest

This script applies the Boruta algorithm to identify important features in the data matrix, visualizing their importance using box-and-whisker plots, and then confirms the selection of significant features using the TentativeRoughFix function.

library(Boruta)
library(plotly)
library(tidyr)
library(dplyr)

cluster_numbers <- clusters$cluster
cluster_factors <- as.factor(cluster_numbers)

set.seed(123) # 
data_matrix_df <- as.data.frame(data_matrix)

boruta_output <- Boruta(cluster_factors ~ ., data=data_matrix_df, doTrace = 0)

library(plotly)
# plot(als, xlab="", xaxt="n")
# lz<-lapply(1:ncol(als$ImpHistory), function(i)
# als$ImpHistory[is.finite(als$ImpHistory[, i]), i])
# names(lz)<-colnames(als$ImpHistory)
# lb<-sort(sapply(lz, median))
# axis(side=1, las=2, labels=names(lb), at=1:ncol(als$ImpHistory), cex.axis=0.5, font = 4)

df_long <- tidyr::gather(as.data.frame(boruta_output$ImpHistory), feature, measurement)

plot_ly(df_long, x=~feature, y = ~measurement, color = ~feature, type = "box") %>%
    layout(title="Box-and-whisker Plots across all features (UKBB Data)",
           xaxis = list(title="Features", categoryorder = "total descending"), 
           yaxis = list(title="Importance"), showlegend=F)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
select_result <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
print(select_result)
## Boruta performed 21 iterations in 0.1533029 secs.
##  9 attributes confirmed important:
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12,
## ecg_stage_name_f5988_1_27,
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0,
## peak_expiratory_flow_pef_f3064_2_1,
## total_peripheral_resistance_during_pwa_f12685_2_0 and 4 more;
##  3 attributes confirmed unimportant:
## number_of_operations_selfreported_f136_0_0,
## urea_reportability_f30676_1_0, values_wanted_f23321_3_16;
select_result$finalDecision
##                         number_of_operations_selfreported_f136_0_0 
##                                                           Rejected 
##                                 peak_expiratory_flow_pef_f3064_2_1 
##                                                          Confirmed 
##                                      triplet_played_left_f4229_0_9 
##                                                          Confirmed 
##                                     triplet_entered_left_f4236_0_8 
##                                                          Confirmed 
##                                          ecg_stage_name_f5988_1_27 
##                                                          Confirmed 
##                  total_peripheral_resistance_during_pwa_f12685_2_0 
##                                                          Confirmed 
##                                          values_wanted_f23321_3_16 
##                                                           Rejected 
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 
##                                                          Confirmed 
##          volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0 
##                                                          Confirmed 
##                                      urea_reportability_f30676_1_0 
##                                                           Rejected 
##                date_of_first_inpatient_diagnosis_icd10_f41280_0_12 
##                                                          Confirmed 
##                                 typical_diet_yesterday_f100020_4_0 
##                                                          Confirmed 
## Levels: Tentative Confirmed Rejected
library(knockoff)

results <- knockoff.filter(data, as.numeric(cluster_factors), fdr = 0.80)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
print(results)
## Call:
## knockoff.filter(X = data, y = as.numeric(cluster_factors), fdr = 0.8)
## 
## Selected variables:
##                         number_of_operations_selfreported_f136_0_0 
##                                                                  1 
##                                 peak_expiratory_flow_pef_f3064_2_1 
##                                                                  2 
##                                      triplet_played_left_f4229_0_9 
##                                                                  3 
##                                          ecg_stage_name_f5988_1_27 
##                                                                  5 
##                  total_peripheral_resistance_during_pwa_f12685_2_0 
##                                                                  6 
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 
##                                                                  7 
##                date_of_first_inpatient_diagnosis_icd10_f41280_0_12 
##                                                                  9 
##                                 typical_diet_yesterday_f100020_4_0 
##                                                                 10